Read in and process the COVID dataset from the New York Times GitHub repository Create interactive graphs of different types using plot_ly() and ggplotly() functions Customize the hoverinfo and other plot features Create a Choropleth map using plot_geo()
We will work with COVID data downloaded from the New York Times. The dataset consists of COVID-19 cases and deaths in each US state during the course of the COVID epidemic.
**The objective of this lab is to explore relationships between cases, deaths, and population sizes of US states, and plot data to demonstrate this
## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data
## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data
### FINISH THE CODE HERE ###
# load COVID state-level data from NYT
cv_states <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
### FINISH THE CODE HERE ###
# load state population data
state_pops <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
### FINISH THE CODE HERE
cv_states <- merge(cv_states, state_pops, by="state")
head, and tail of
the datadim(cv_states)
## [1] 51438 9
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 1 Alabama 2021-07-17 1 559478 11443 1 4887871 96.50939 AL
## 2 Alabama 2021-08-11 1 619752 11689 1 4887871 96.50939 AL
## 3 Alabama 2022-05-25 1 1310513 19651 1 4887871 96.50939 AL
## 4 Alabama 2022-06-18 1 1334981 19696 1 4887871 96.50939 AL
## 5 Alabama 2020-05-11 1 10164 403 1 4887871 96.50939 AL
## 6 Alabama 2020-12-24 1 338801 4676 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 51433 Wyoming 2022-04-20 56 156392 1807 56 577737 5.950611 WY
## 51434 Wyoming 2022-04-19 56 156392 1807 56 577737 5.950611 WY
## 51435 Wyoming 2021-12-31 56 115638 1526 56 577737 5.950611 WY
## 51436 Wyoming 2021-06-03 56 60543 720 56 577737 5.950611 WY
## 51437 Wyoming 2021-09-07 56 78495 879 56 577737 5.950611 WY
## 51438 Wyoming 2021-04-23 56 57696 705 56 577737 5.950611 WY
str(cv_states)
## 'data.frame': 51438 obs. of 9 variables:
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ date : IDate, format: "2021-07-17" "2021-08-11" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 559478 619752 1310513 1334981 10164 338801 1004622 1299816 1290692 1534287 ...
## $ deaths : int 11443 11689 19651 19696 403 4676 16641 19545 18998 20558 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : chr "AL" "AL" "AL" "AL" ...
# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
### FINISH THE CODE HERE
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame': 51438 obs. of 9 variables:
## $ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date, format: "2020-03-13" "2020-03-14" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 6 12 23 29 39 51 78 106 131 157 ...
## $ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 635 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 440 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 976 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 181 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 801 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 287 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 51075 Wyoming 2022-11-10 56 179366 1917 56 577737 5.950611 WY
## 50602 Wyoming 2022-11-11 56 179366 1917 56 577737 5.950611 WY
## 51311 Wyoming 2022-11-12 56 179366 1917 56 577737 5.950611 WY
## 51285 Wyoming 2022-11-13 56 179366 1917 56 577737 5.950611 WY
## 51300 Wyoming 2022-11-14 56 179366 1917 56 577737 5.950611 WY
## 50750 Wyoming 2022-11-15 56 179838 1924 56 577737 5.950611 WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 635 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 440 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 976 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 181 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 801 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 287 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states)
## state date fips cases
## Washington : 1030 Min. :2020-01-21 Min. : 1.00 Min. : 1
## Illinois : 1027 1st Qu.:2020-11-04 1st Qu.:16.00 1st Qu.: 90384
## California : 1026 Median :2021-07-09 Median :29.00 Median : 344961
## Arizona : 1025 Mean :2021-07-08 Mean :29.78 Mean : 818955
## Massachusetts: 1019 3rd Qu.:2022-03-13 3rd Qu.:44.00 3rd Qu.: 964554
## Wisconsin : 1015 Max. :2022-11-15 Max. :72.00 Max. :11417891
## (Other) :45296
## deaths geo_id population pop_density
## Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
## 1st Qu.: 1372 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
## Median : 5140 Median :29.00 Median : 4468402 Median : 107.860
## Mean :11417 Mean :29.78 Mean : 6403767 Mean : 422.948
## 3rd Qu.:14431 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
## Max. :97176 Max. :72.00 Max. :39557045 Max. :11490.120
## NA's :978
## abb
## WA : 1030
## IL : 1027
## CA : 1026
## AZ : 1025
## MA : 1019
## WI : 1015
## (Other):45296
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2022-11-15"
new_cases and new_deaths and
correct outliersnew_cases, and new deaths,
new_deaths:
new_cases equal to the difference
between cases on date i and date i-1, starting on date i=2plotly for EDA: See if there are outliers or values
that don’t make sense for new_cases and
new_deaths. Which states and which dates have strange
values?new_cases or
new_deaths to 0cases and deaths as cumulative
sum of updated new_cases and new_deaths# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
cv_subset = cv_subset[order(cv_subset$date),]
# add starting level for new cases and deaths
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
### FINISH THE CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2022-06-01")
### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
# add starting level for new cases and deaths
cv_subset$cases = cv_subset$cases[1]
cv_subset$deaths = cv_subset$deaths[1]
### FINISH CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j-1]
cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2=NULL
Add population-normalized (by 100,000) variables for each
variable type (rounded to 1 decimal place). Make sure the variables you
calculate are in the correct format (numeric). You can use
the following variable names:
per100k = cases per 100,000 populationnewper100k= new cases per 100,000deathsper100k = deaths per 100,000newdeathsper100k = new deaths per 100,000Add a “naive CFR” variable representing
deaths / cases on each date for each state
Create a dataframe representing values on the most recent date,
cv_states_today, as done in lecture
### FINISH CODE HERE
# add population normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))
plot_ly()plot_ly() representing
pop_density vs. various variables (e.g. cases,
per100k, deaths, deathsper100k)
for each state on most recent date (cv_states_today)
hovermode = "compare"### FINISH CODE HERE
# pop_density vs. cases
cv_states_today %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# pop_density vs. deathsper100k
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# Adding hoverinfo
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
hoverinfo = 'text',
text = ~paste( paste(state, ":", sep=""),
paste(" Cases per 100k: ", per100k, sep="") ,
paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
yaxis = list(title = "Deaths per 100k"),
xaxis = list(title = "Population Density"),
hovermode = "compare")
ggplotly() and geom_smooth()pop_density vs. newdeathsper100k
create a chart with the same variables using
gglot_ly()geom_smooth()
pop_density is a
correlate of newdeathsper100k?### FINISH CODE HERE
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) + geom_point() + geom_smooth()
ggplotly(p)
naive_CFR for all states
over time using plot_ly()
naive_CFR for
the states that had an increase in September. How have they changed over
time?new_cases and new_deaths together in one plot.
Hint: use add_layer()
### FINISH CODE HERE
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>% filter(state=="Florida") %>% plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>%
add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")
Create a heatmap to visualize new_cases for each state
on each date greater than June 1st, 2021 - Start by mapping selected
features in the dataframe into a matrix using the tidyr
package function pivot_wider(), naming the rows and
columns, as done in the lecture notes - Use plot_ly() to
create a heatmap out of this matrix. Which states stand out? - Repeat
with newper100k variable. Now which states stand out? -
Create a second heatmap in which the pattern of new_cases
for each state over time becomes more clear by filtering to only look at
dates every two weeks
### FINISH CODE HERE
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% dplyr::filter(date>as.Date("2022-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
# Repeat with newper100k
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% dplyr::filter(date>as.Date("2022-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2022-06-15"), as.Date("2022-11-01"), by="2 weeks")
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
naive_CFR by state on
October 15, 2021naive_CFR by state
on most recent datesubplot(). Make sure
the shading is for the same range of values (google is your friend for
this)### For specified date
pick.date = "2022-10-15"
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>%
filter(date==pick.date) %>%
select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100,
paste(state_name, '<br>',
"Cases per 100k: ", newper100k, '<br>',
"Cases: ", cases, '<br>',
"Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Make sure both maps are on the same color scale
shadeLimit <- 35
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k,
text = ~hover,
locations = ~state,
color = ~newper100k,
colors = 'Purples'
)
fig <- fig %>%
colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))
fig <- fig %>%
layout(
geo = set_map_details
)
fig_pick.date <- fig
#############
### Map for today's date
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>%
select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100,
paste(state_name, '<br>',
"Cases per 100k: ", newper100k, '<br>',
"Cases: ", cases, '<br>',
"Deaths: ", deaths))
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k,
text = ~hover,
locations = ~state,
color = ~newper100k,
colors = 'Purples'
)
fig <- fig %>%
colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))
fig <- fig %>%
layout(
geo = set_map_details
)
fig_Today <- fig
### Plot together
finalfig <- subplot(fig_pick.date, fig_Today, nrows = 2) %>%
layout(showlegend = FALSE,
title = paste('Cases per 100k by State',
'<br>(Hover for value)'),
hovermode = TRUE
) %>%
colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))
annotations = list(
list(
x = 0.5,
y = 0.5,
text = pick.date,
xref = "paper",
yref = "paper",
xanchor = "center",
yanchor = "bottom",
showarrow = FALSE
),
list(
x = 0.5,
y = -0.05,
text = Sys.Date(),
xref = "paper",
yref = "paper",
xanchor = "center",
yanchor = "bottom",
showarrow = FALSE
))
finalfig <- finalfig %>%layout(annotations = annotations)
finalfig
Ideally, we would not repeat the colorbar twice.